
Conversation

@doru1004 (Contributor) commented Oct 9, 2025

Sinking ShuffleVector / ExtractElement / InsertElement instructions into their users' blocks can enable SDAG combines by giving instruction selection direct visibility of the values instead of emitting CopyToReg/CopyFromReg nodes. The IR Sink pass does not sink into loops, so this PR extends the CodeGenPrepare target hook isProfitableToSinkOperands to handle these instructions.
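A hand-written illustration of the transform (the function and names are invented for this sketch, not taken from the patch):

; Before sinking: %e is defined outside the loop that uses it, so inside
; %loop SelectionDAG only sees a CopyFromReg of %e.
define float @extract_in_preheader(<2 x float> %v, i32 %n) {
entry:
  %e = extractelement <2 x float> %v, i64 0
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi float [ 0.0, %entry ], [ %acc.next, %loop ]
  %acc.next = fadd float %acc, %e
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret float %acc.next
}
; After CodeGenPrepare sinking, the extractelement is re-created in %loop
; directly before the fadd, so both are visible to SDAG in one block.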

Author: Jeffrey Byrnes

@llvmbot (Member) commented Oct 9, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Gheorghe-Teodor Bercea (doru1004)

Changes

Sinking ShuffleVector / ExtractElement / InsertElement instructions into their users' blocks can enable SDAG combines by giving instruction selection direct visibility of the values instead of emitting CopyToReg/CopyFromReg nodes. The IR Sink pass does not sink into loops, so this PR extends the CodeGenPrepare target hook isProfitableToSinkOperands to handle these instructions.


Patch is 119.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162580.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+84)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll (+60-60)
  • (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+423-436)
  • (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+382-434)
  • (modified) llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll (-2)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..def8ba47c9c37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1301,6 +1301,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
 
     if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
       Ops.push_back(&Op);
+
+    // Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
+    // will be optimized away, and sinking them can help SDAG combines.
+    DataLayout DL = I->getModule()->getDataLayout();
+    auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
+                                           unsigned VecIndex) {
+      unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
+      return EltSize >= 32 ||
+             (EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
+    };
+
+    uint64_t VecIndex;
+    Value *Vec;
+    if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
+      Instruction *VecOpInst =
+          dyn_cast<Instruction>(cast<Instruction>(Op.get())->getOperand(0));
+      // If a zero cost extractvector instruction is the only use of the vector,
+      // then it may be combined with the def.
+      if (VecOpInst && VecOpInst->hasOneUse())
+        continue;
+
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (match(Op.get(),
+              m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
+      if (Shuffle->isIdentity()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      unsigned EltSize = DL.getTypeSizeInBits(
+          cast<VectorType>(cast<VectorType>(Shuffle->getType()))
+              ->getElementType());
+
+      // For i32 (or greater) shufflevectors, these will be lowered into a
+      // series of insert / extract elements, which will be coalesced away.
+      if (EltSize >= 32) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (EltSize < 16 || !ST->has16BitInsts())
+        continue;
+
+      int NumSubElts, SubIndex;
+      if (Shuffle->changesLength()) {
+        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
+          Ops.push_back(&Op);
+          continue;
+        }
+
+        if (Shuffle->isExtractSubvectorMask(SubIndex) ||
+            Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+          if (!(SubIndex % 2)) {
+            Ops.push_back(&Op);
+            continue;
+          }
+        }
+      }
+
+      if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
+          Shuffle->isSingleSource()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+        if (!(SubIndex % 2)) {
+          Ops.push_back(&Op);
+          continue;
+        }
+      }
+    }
   }
 
   return !Ops.empty();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 302b2395642d0..74b31913abb7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -2149,11 +2149,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB11_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
 ; CI-NEXT:    s_and_b32 s6, s2, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v1, s4
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; CI-NEXT:    v_mov_b32_e32 v1, s6
-; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT:    v_mov_b32_e32 v0, s6
+; CI-NEXT:    v_mov_b32_e32 v1, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CI-NEXT:    s_mov_b32 s6, 0
 ; CI-NEXT:  .LBB11_2: ; %Flow53
 ; CI-NEXT:    s_xor_b32 s6, s6, 1
@@ -2224,11 +2224,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB11_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else16
 ; CI-NEXT:    s_and_b32 s6, s3, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v2, s5
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v2|
-; CI-NEXT:    v_mov_b32_e32 v2, s6
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s6
+; CI-NEXT:    v_mov_b32_e32 v2, s3
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    s_mov_b32 s6, 0
 ; CI-NEXT:  .LBB11_10: ; %Flow49
 ; CI-NEXT:    s_xor_b32 s6, s6, 1
@@ -2322,11 +2322,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB11_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
 ; VI-NEXT:    s_and_b32 s6, s2, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v1, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; VI-NEXT:    v_mov_b32_e32 v1, s6
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; VI-NEXT:    s_mov_b32 s6, 0
 ; VI-NEXT:  .LBB11_2: ; %Flow53
 ; VI-NEXT:    s_xor_b32 s6, s6, 1
@@ -2397,11 +2397,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB11_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else16
 ; VI-NEXT:    s_and_b32 s6, s3, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v2, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v2|
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    s_mov_b32 s6, 0
 ; VI-NEXT:  .LBB11_10: ; %Flow49
 ; VI-NEXT:    s_xor_b32 s6, s6, 1
@@ -2503,11 +2503,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB12_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
 ; CI-NEXT:    s_and_b32 s2, s4, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v1, s8
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v1|
-; CI-NEXT:    v_mov_b32_e32 v1, s2
-; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s8
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v0|
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s4
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB12_2: ; %Flow127
 ; CI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2578,11 +2578,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB12_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else16
 ; CI-NEXT:    s_and_b32 s2, s5, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v2, s9
-; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v2|
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT:    v_mov_b32_e32 v1, s9
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s2
+; CI-NEXT:    v_mov_b32_e32 v2, s5
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB12_10: ; %Flow123
 ; CI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2653,11 +2653,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB12_18
 ; CI-NEXT:  ; %bb.17: ; %frem.else47
 ; CI-NEXT:    s_and_b32 s2, s6, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v3, s10
-; CI-NEXT:    v_mov_b32_e32 v2, s6
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v3|
-; CI-NEXT:    v_mov_b32_e32 v3, s2
-; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-NEXT:    v_mov_b32_e32 v2, s10
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v2|
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s6
+; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB12_18: ; %Flow119
 ; CI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2728,11 +2728,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB12_26
 ; CI-NEXT:  ; %bb.25: ; %frem.else78
 ; CI-NEXT:    s_and_b32 s2, s7, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v4, s11
-; CI-NEXT:    v_mov_b32_e32 v3, s7
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v4|
-; CI-NEXT:    v_mov_b32_e32 v4, s2
-; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s11
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v3|
+; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_mov_b32_e32 v4, s7
+; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB12_26: ; %Flow115
 ; CI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2834,11 +2834,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB12_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
 ; VI-NEXT:    s_and_b32 s2, s4, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v1, s8
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v1|
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v0|
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; VI-NEXT:    s_mov_b32 s2, 0
 ; VI-NEXT:  .LBB12_2: ; %Flow127
 ; VI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2909,11 +2909,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB12_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else16
 ; VI-NEXT:    s_and_b32 s2, s5, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v2, s9
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v2|
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    s_mov_b32 s2, 0
 ; VI-NEXT:  .LBB12_10: ; %Flow123
 ; VI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2984,11 +2984,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB12_18
 ; VI-NEXT:  ; %bb.17: ; %frem.else47
 ; VI-NEXT:    s_and_b32 s2, s6, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v3, s10
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v3|
-; VI-NEXT:    v_mov_b32_e32 v3, s2
-; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v2|
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s6
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; VI-NEXT:    s_mov_b32 s2, 0
 ; VI-NEXT:  .LBB12_18: ; %Flow119
 ; VI-NEXT:    s_xor_b32 s2, s2, 1
@@ -3059,11 +3059,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB12_26
 ; VI-NEXT:  ; %bb.25: ; %frem.else78
 ; VI-NEXT:    s_and_b32 s2, s7, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v4, s11
-; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v4|
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v3|
+; VI-NEXT:    v_mov_b32_e32 v3, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s7
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; VI-NEXT:    s_mov_b32 s2, 0
 ; VI-NEXT:  .LBB12_26: ; %Flow115
 ; VI-NEXT:    s_xor_b32 s2, s2, 1
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 78a961ea0da17..d75d2597685d6 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -5783,11 +5783,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v3
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB9_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v5, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v2, v5
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX11-TRUE16-NEXT:    s_branch .LBB9_8
@@ -6221,12 +6221,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s6, s5
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX1150-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s4
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s4
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s6, s5
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, v0.l, s7
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB9_8
@@ -6691,12 +6691,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s6, s5
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX1200-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s4
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s4
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s6, s5
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1200-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, v0.l, s7
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB9_8
@@ -8964,11 +8964,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v5
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB10_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, v7, v4
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, v4, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.l, v4.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX11-TRUE16-NEXT:    s_branch .LBB10_8
@@ -9805,12 +9805,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s8, s6
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_2
 ; GFX1150-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s5
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s5
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s8, s6
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s9, -1, 0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v0.l, s5, v0.l, s9
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB10_8
@@ -10713,12 +10713,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s8, s6
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_2
 ; GFX1200-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s5
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s5
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s8, s6
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s9, -1, 0
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1200-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v0.l, s5, v0.l, s9
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB10_8
@@ -12714,18 +12714,18 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150:       ; %bb.0:
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1150-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX1150-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX1150-NEXT:    global_load_b64 v[1:2], v2, s[6:7] offset:32
 ; GFX1150-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX1150-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1150-NEXT:    global_load_b64 v[0:1], v2, s[8:9] offset:32
 ; GFX1150-NEXT:    s_and_b32 s3, s6, 0x7fffffff
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX1150-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX1150-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1150-NEXT:    s_and_b32 s8, s4, 0x7fffffff
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1150-NEXT:    s_cmp_ngt_f32 s3, s8
@@ -12933,232 +12933,221 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-LABEL: frem_v2f32:
 ; GFX1200:       ; %bb.0:
 ; GFX1200-NEXT:    s_clause 0x1
-; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1200-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
-; GFX1200-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1200-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0
-; GFX1200-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
-; GFX1200-NEXT:    s_wait_loadcnt 0x0
-; GFX1200-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX1200-NEXT:    global_load_b64 v[1:2], v2, s[6:7] offset:32
-; GFX1200-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX1200-NEXT:    s_and_b32 s3, s6, 0x7fffffff
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b64 v[2:3], v0, s[10:11]
+; GFX1200-NEXT:    global_load_b64 v[0:1], v0, s[0:1] offset:32
+; GFX1200-NEXT:    s_wait_loadcnt 0x1
+; GFX1200-NEXT:    v_and_b32_e32 v4, 0x7fffffff, v2
 ; GFX1200-NEXT:    s_wait_loadcnt 0x0
-; GFX1200-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX1200-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX1200-NEXT:    s_and_b32 s8, s4, 0x7fffffff
-; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1200-NEXT:    s_cmp_ngt_f32 s3, s8
-; GFX1200-NE...
[truncated]

@doru1004 requested review from arsenm and shiltian October 9, 2025 01:53
@arsenm (Contributor) left a comment

Needs dedicated IR test


    // Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
    // will be optimized away, and sinking them can help SDAG combines.
    DataLayout DL = I->getModule()->getDataLayout();

Never copy a DataLayout

Suggested change
-    DataLayout DL = I->getModule()->getDataLayout();
+    const DataLayout &DL = I->getModule()->getDataLayout();

Comment on lines +1308 to +1313
    auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
                                           unsigned VecIndex) {
      unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
      return EltSize >= 32 ||
             (EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
    };

The TTI costs for the vector ops should already be reporting free, shouldn't need to reimplement this

Comment on lines +1381 to +1382
      if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
        if (!(SubIndex % 2)) {

this can be combined

Comment on lines +1366 to +1370
        if (Shuffle->isExtractSubvectorMask(SubIndex) ||
            Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
          if (!(SubIndex % 2)) {
            Ops.push_back(&Op);
            continue;

these can be combined

@@ -0,0 +1,49 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -march=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s

This should be an IR test that does the sinking, which I assume is CodeGenPrepare
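For instance, a minimal CodeGenPrepare IR test might look like the sketch below (illustrative only: the RUN line assumes the new-pass-manager spelling of the pass with its profile-summary requirement, and the CHECK lines are hand-written rather than autogenerated):

; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 \
; RUN:   -passes='require<profile-summary>,function(codegenprepare)' %s | FileCheck %s

; The extract should be re-created inside %loop rather than left in %entry.
define amdgpu_kernel void @sink_extract(ptr addrspace(1) %out, <2 x i32> %v, i32 %n) {
; CHECK-LABEL: define amdgpu_kernel void @sink_extract(
; CHECK:       loop:
; CHECK:         extractelement <2 x i32> %v, i64 0
entry:
  %e = extractelement <2 x i32> %v, i64 0
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
  %sum.next = add i32 %sum, %e
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  store i32 %sum.next, ptr addrspace(1) %out
  ret void
}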

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -march=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s

define amdgpu_kernel void @runningSum(ptr addrspace(1) %out, i32 %inputElement0, i32 %inputElement1, i32 %inputIter) {

Comment test purposes
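For example, a purpose comment along these lines (illustrative wording, not the author's):

; Tests that the loop-invariant broadcast shufflevector feeding the running
; sum is sunk into the loop body by CodeGenPrepare, so SDAG can combine it
; with the add instead of materializing it through CopyToReg/CopyFromReg.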

  %runningSum = add <2 x i32> %broadcast1, %previousSum
  %itersLeft = sub i32 %iterCount, 1
  %cond = icmp eq i32 %itersLeft, 0
  br i1 %cond, label %loopExit, label %loopBody, !llvm.loop !0

I don't think this one function is stressing all of the conditions tested here
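A hypothetical companion function for one of the untested predicates, the Shuffle->isReverse() path, which only triggers for 16-bit elements on subtargets with 16-bit instructions (a sketch, not part of the patch):

define amdgpu_kernel void @sink_reverse_shuffle(ptr addrspace(1) %out, <4 x i16> %v, i32 %n) {
entry:
  ; A reverse shuffle of a 16-bit vector exercises Shuffle->isReverse().
  %rev = shufflevector <4 x i16> %v, <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi <4 x i16> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
  %acc.next = add <4 x i16> %acc, %rev
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  store <4 x i16> %acc.next, ptr addrspace(1) %out
  ret void
}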
